#andy harris
#aug 24, 2014
#Do polling station level analysis (header says Eldoret South, but the data files read and the plot written below are the Kuresoi ones)
library(foreach)
library(doMC)
registerDoMC(14)
library(stringr)
library(corpcor)
library(quadprog)
library(data.table)
library(ggplot2)

# Normalise a character vector of names so they can be matched across sources.
#
# Steps, in order: upper-case; delete a fixed set of stray characters; turn the
# digit zero into the letter O and drop every other digit; delete remaining
# punctuation; trim surrounding whitespace; drop entries that end up empty.
#
# x:       character vector (anything toupper() accepts).
# returns: cleaned character vector. It can be SHORTER than x, because entries
#          that clean to "" are removed, so callers that assign the result back
#          onto something of fixed length must expect a length error then.
clean.func <- function(x){
	x <- toupper(x)
	# Stray characters removed as literals (fixed = TRUE, so no regex escaping
	# is needed). These are all single-character deletions, so their relative
	# order does not matter.
	for(junk in c(">", "]", "[", "\\", "|", "'", "´", "-", ".", ",", ")", "(")){
		x <- gsub(x, pattern = junk, replacement = "", fixed = TRUE)
	}
	# Zero becomes the letter O; this has to happen before the digit strip
	# below, otherwise the zero would simply be deleted.
	x <- gsub(x, pattern = "0", replacement = "O", fixed = TRUE)
	x <- gsub(x, pattern = "[0-9]", replacement = "")
	# The class name must sit inside a bracket list: "[[:punct:]]". The former
	# pattern "[:punct:]" was the character set {:, p, u, n, c, t}, which on
	# upper-cased input only ever removed colons.
	# NOTE(review): names cleaned elsewhere with the old pattern may still carry
	# characters such as "/" or "_" - confirm both sides use the same cleaner.
	x <- gsub(x, pattern = "[[:punct:]]{1,}", replacement = "")
	x <- str_trim(x)
	rm.ind <- which(x == "")
	if(length(rm.ind) > 0){
		x <- x[-rm.ind]
	}
	x
}


# Bring in the saved workspace objects and the two estimator scripts.
# Order is kept as-is: later steps may overwrite names created by earlier ones.
load("kuresoiVRs.Rdata")
source(file = "nameEstW.R")
source(file = "nameEst.R")

# Despite the .Rdata extension this file is read with readRDS(), i.e. it is
# expected to hold a single serialised object.
conds <- readRDS(file = "nameConditionals.Rdata")
# Put the row names through the same cleaner used for names elsewhere.
row.names(conds) <- clean.func(x = row.names(conds))


# Bootstrap the 2010-minus-2007 change in nameEstW() estimates, one polling
# station at a time. ps07 / ps10 are not defined in this file (presumably they
# come from kuresoiVRs.Rdata); element ii of each is treated as the same station.
# Result: diff.store[[ii]] is a matrix with one row per bootstrap replicate and
# one column per column of conds.
# NOTE(review): no seed is set and the replicates run under %dopar%, so the
# draws are not reproducible from run to run.

# The loop below reads ps10[[ii]] for every ii in ps07; fail up front rather
# than part-way through the bootstrap if ps10 is too short.
stopifnot(length(ps10) >= length(ps07))

n.boot <- 1000	# bootstrap replicates per station

diff.store <- vector(length = length(ps07), mode = 'list')
for(ii in seq_along(diff.store)){
	print(ii)	# progress
	targ07 <- ps07[[ii]]
	targ10 <- ps10[[ii]]
	diff.store[[ii]] <- foreach(jj = seq_len(n.boot), .combine = rbind) %dopar% {
			# Resample the station's entries with replacement (same size as the
			# original) and estimate from the resampled frequency table.
			boot07 <- targ07[sample.int(length(targ07), replace = TRUE)]
			est07 <- nameEstW(cond = conds, targ = prop.table(table(boot07)))
			# MERU is added into KIKUYU; the MERU entry itself is left in place.
			est07['KIKUYU'] <- est07['KIKUYU'] + est07['MERU']
			boot10 <- targ10[sample.int(length(targ10), replace = TRUE)]
			est10 <- nameEstW(cond = conds, targ = prop.table(table(boot10)))
			est10['KIKUYU'] <- est10['KIKUYU'] + est10['MERU']
			diffest <- est10 - est07
			# Assumes nameEstW() returns one value per column of conds, in
			# column order - TODO confirm against nameEstW.R.
			names(diffest) <- colnames(conds)
			diffest
	}
}

# Summarise each station's bootstrap draws: the 0.5%, 50% and 99.5% quantiles
# of the change for KALENJIN, KIKUYU and KISII. One row per station, nine
# columns laid out as
#   1:3 KALENJIN (low, median, high), 4:6 KIKUYU, 7:9 KISII.
pngmeth <- foreach(ii = seq_along(diff.store), .combine = rbind) %dopar% {
	qs <- apply(diff.store[[ii]], 2, quantile, c(0.005, 0.5, 0.995))[,c('KALENJIN', 'KIKUYU', 'KISII')]
	c(qs[,1], qs[,2], qs[,3])
}
# With a single station foreach hands back the lone result vector untouched
# (nothing to combine), so restore the one-row matrix shape that the column
# indexing further down relies on.
if(is.null(dim(pngmeth)) && length(pngmeth) > 0){
	pngmeth <- matrix(pngmeth, nrow = 1, dimnames = list(NULL, names(pngmeth)))
}



# Dictionary-based comparison: for each station, look every name up in the
# dictionary, tabulate the groups of the names that were found, and take the
# 2010-minus-2007 difference in group proportions.
load(file = "dict.Rdata")
row.names(dictionary1) <- dictionary1$name

tdict <- dictionary1

# One fixed set of groups for every table below. Without it, a station in
# which some group has no names yields a shorter table when `group` is a
# character column, and the 2010 - 2007 subtraction misaligns or fails.
grp.levels <- levels(as.factor(tdict$group))

diff.store.cond <- foreach(ii = seq_along(ps07), .combine = rbind) %dopar% {
		targ07 <- ps07[[ii]]
		targ10 <- ps10[[ii]]
		# keep only the names present in the dictionary
		targ07t <- targ07[targ07 %in% tdict$name]
		targ10t <- targ10[targ10 %in% tdict$name]
		# row lookup by name (row names were set to dictionary1$name above)
		tdict07 <- tdict[targ07t,]
		tdict10 <- tdict[targ10t,]
		tprop07 <- prop.table(table(factor(tdict07$group, levels = grp.levels)))
		tprop10 <- prop.table(table(factor(tdict10$group, levels = grp.levels)))
		diffest <- c(tprop10 - tprop07)
		# num: share of the station's 2007 names that were found in the dictionary
		diffest <- c(diffest, num = length(targ07t)/length(targ07))
		diffest
}
# With a single station foreach returns the lone result vector as-is; put it
# back into one-row matrix form so column indexing by group name still works.
if(is.null(dim(diff.store.cond)) && length(diff.store.cond) > 0){
	diff.store.cond <- matrix(diff.store.cond, nrow = 1, dimnames = list(NULL, names(diff.store.cond)))
}

cond.res <- diff.store.cond

#polling station stats

#prepare data for plotting

# Build the plotting frame for one group.
#   cols:  the three pngmeth columns holding that group's low / median / high
#          bootstrap quantiles
#   label: group name stored in the `group` column (used for facetting)
# Station-level covariates come from ps.stats, which is not defined in this
# file (presumably loaded from kuresoiVRs.Rdata) and is assumed to have one
# row per station in the same order as pngmeth - TODO confirm.
# Rows are sorted by n07 and `ps` becomes a factor whose levels follow that
# order, so the x axis of the plot runs in order of n07.
group.frame <- function(cols, label){
	out <- as.data.frame(pngmeth[, cols, drop = FALSE])
	names(out) <- c('min', 'mid', 'max')
	out$ps <- seq_len(nrow(out))
	out$n07 <- ps.stats$n07
	out$diffs <- ps.stats$diff
	out$pctkik <- ps.stats$pctkik
	out$group <- label
	out <- out[order(out$n07),]
	row.names(out) <- out$ps
	out$ps <- factor(out$ps, levels = out$ps)
	out
}

# pngmeth column layout: 1:3 Kalenjin, 4:6 Kikuyu, 7:9 Kisii
kal <- group.frame(1:3, 'Kalenjin')
kik <- group.frame(4:6, 'Kikuyu')
kis <- group.frame(7:9, 'Kisii')

pdata <- rbind(kal, kik, kis)

# Long-format frame of the dictionary-method point estimates: one row per
# station and group, stacked in the order Kikuyu, Kalenjin, Kisii.
condr <- data.frame(
	mid = c(cond.res[,'KIKUYU'], cond.res[,'KALENJIN'], cond.res[,'KISII']),
	group = rep(c('Kikuyu', 'Kalenjin', 'Kisii'), each = nrow(cond.res)),
	ps = rep(seq_len(nrow(cond.res)), times = 3)
)
# Use the station ordering of pdata so points and intervals share an x axis.
condr$ps <- factor(condr$ps, levels = levels(pdata$ps))

# Intervals (pdata min..max) are drawn as vertical ranges, dictionary-method
# estimates as open circles, one facet per group.
# `size` is kept for the line geoms for compatibility with older ggplot2;
# ggplot2 >= 3.4.0 prefers `linewidth` there.
p0 <- ggplot()
p1 <- geom_linerange(data = pdata, aes(x = ps, ymin = min, ymax = max), size = 0.8)
p2 <- geom_point(data = condr, aes(x = ps, y = mid), shape = 1)
pout <- p0 + p2 + p1 +
	theme_bw() +
	geom_hline(yintercept = 0, size = 0.25) +
	facet_grid(group~., scales = 'free_y') +
	xlab('Polling Stations') +
	ylab('Percent Change') +
	theme(
		strip.text = element_text(size = 12),
		axis.ticks.x = element_blank(),
		axis.text.x = element_blank()
	)
ggsave(pout, filename = 'kuresoiPlot.pdf', units = 'in', width = 8, height = 5)
